Load required libraries

# Data wrangling: lubridate for date handling, dplyr for data manipulation.
# NOTE(review): caret's gbm method loads plyr later in this script, which
# masks several dplyr verbs (see the warning printed at the first boosting
# fit) — keep dplyr calls before that point, or use dplyr:: explicitly.
library(lubridate)
library(dplyr)

# For plotting
library(ggplot2)

# For regression tree
# library(MASS)
library(tree)

# For bagging and random forest models
library(randomForest)

### For the boosting model
library(gbm)

### For tuning models (trainControl / train)
library(caret)

Load and clean up data

# Load the heat wave dataset; this provides the `all.hws` data frame.
load("data-raw/ListOfAllHeatwaves.Rdata")

# `all.hws` has two columns named "mean.temp"; give the second one a
# unique name so downstream column selection is unambiguous.
mean_temp_cols <- which(colnames(all.hws) == "mean.temp")
colnames(all.hws)[mean_temp_cols[2]] <- "mean.temp.1"

# Add the heat wave's starting year as a predictor, then replace any
# spaces in column names with dots.
hw <- mutate(all.hws, start.year = year(start.date))
colnames(hw) <- gsub(" ", ".", colnames(hw))

Create dataframes for fitting models for y1 and y2

# Drop identifier / bookkeeping columns that must not be used as
# predictors in any of the models below.
hw <- select(hw, -hw.number, -start.date, -end.date, -id, -Std..Error,
             -city, -Posterior.Estimate, -Posterior.SE, -post.se)

# One modeling frame per outcome: Y1 keeps `Estimate`, Y2 keeps
# `post.estimate`; each drops the other outcome so it cannot leak in.
to_fit_y1 <- select(hw, -post.estimate)
to_fit_y2 <- select(hw, -Estimate)

# Create a random subset of heat waves to use to train models.
# The seed makes the split reproducible.
set.seed(1001)
row_ids <- seq_len(nrow(hw))

train <- sort(sample(row_ids, length(row_ids) / 2))
head(train)
## [1]  2  3  5  6  9 11
test <- row_ids[-train]
head(test)
## [1]  1  4  7  8 10 12

Fit a regression tree for \(Y_1\):

# Fit a single regression tree for Y1 (`Estimate`) on the training rows.
tree.hw <- tree(Estimate ~ ., data = to_fit_y1, subset = train)
summary(tree.hw)
## 
## Regression tree:
## tree(formula = Estimate ~ ., data = to_fit_y1, subset = train)
## Variables actually used in tree construction:
## [1] "start.doy"         "min.temp.quantile" "Ppoverty"         
## Number of terminal nodes:  4 
## Residual mean deviance:  0.03799 = 56.46 / 1486 
## Distribution of residuals:
##     Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
## -2.37800 -0.09941  0.01519  0.00000  0.11770  0.85470

The variables included in this model are: start.doy, min.temp.quantile, Ppoverty. Here is a plot of the final tree for this model:

# Draw the fitted tree; pretty = 0 prints split labels as-is.
plot(tree.hw)
text(tree.hw, pretty = 0)

Here are the results of using cross-validation to decide whether (and how much) to prune the tree:

# Cost-complexity cross-validation: plot deviance against tree size.
cv.hw <- cv.tree(tree.hw)
plot(cv.hw$size, cv.hw$dev, type='b')

Based on this analysis, the best tree size would be 4 nodes.

Using the unpruned tree, here are predictions on the held-out test dataset:

# Predict Y1 on the held-out rows with the unpruned tree.
yhat <- predict(tree.hw, newdata = to_fit_y1[test, ])

# Observed vs predicted, on common axis limits, with the y = x line
# marking perfect prediction.
pred_obs <- data.frame(estimated_y1 = yhat,
                       true_y1 = to_fit_y1[test, "Estimate"])
axis_limits <- range(pred_obs)
ggplot(pred_obs, aes(x = estimated_y1, y = true_y1)) + 
  geom_point(alpha = 0.2) + 
  xlim(axis_limits) + ylim(axis_limits) + 
  theme_minimal() + 
  geom_abline(intercept = 0, slope = 1) + 
  xlab("Estimated Y_1") + ylab("True Y_1")

The mean squared error for this model was \(1.3320971 \times 10^{-5}\).

Fit a bagging model:

# Bagging: a random forest in which every predictor is eligible at every
# split (mtry = number of columns minus the outcome).
set.seed(1)
bag.hw <- randomForest(Estimate ~ ., 
                       data = to_fit_y1, subset = train,
                       mtry = ncol(to_fit_y1) - 1, importance = TRUE)

# Predictions for the held-out rows.
yhat.bag <- predict(bag.hw, newdata = to_fit_y1[test, ])

# Observed vs predicted with common axis limits and the y = x reference.
bag_pred_obs <- data.frame(estimated_y1 = yhat.bag,
                           true_y1 = to_fit_y1[test, "Estimate"])
bag_limits <- range(bag_pred_obs)
ggplot(bag_pred_obs, aes(x = estimated_y1, y = true_y1)) + 
  geom_point(alpha = 0.2) + 
  xlim(bag_limits) + ylim(bag_limits) + 
  theme_minimal() + 
  geom_abline(intercept = 0, slope = 1) + 
  xlab("Estimated Y_1") + ylab("True Y_1")

The mean squared error for this model was \(4.5332584 \times 10^{-5}\).

Here is more on the variable importance for this model:

# Variable importance for the bagged model. %IncMSE = increase in
# out-of-bag MSE when the variable is permuted; IncNodePurity = total
# decrease in node RSS from splits on that variable.
importance(bag.hw)
##                      %IncMSE IncNodePurity
## mean.temp           7.127092    2.43573258
## max.temp           12.435896    1.93576641
## min.temp           10.426944    1.48729252
## length              6.082851    0.98018861
## start.doy           8.867151    8.44876104
## start.month         5.545108    0.53208653
## days.above.80       6.241774    0.64997029
## days.above.85       6.531809    0.29048742
## days.above.90       5.105985    0.21477811
## days.above.95       2.112760    0.03599527
## days.above.99th     2.167899    0.52560419
## days.above.99.5th   2.985387    0.29941594
## first.in.season     2.306548    0.72265838
## mean.temp.quantile  2.742567    3.99990699
## max.temp.quantile   2.970850    4.50393698
## min.temp.quantile   5.930149    4.25589726
## pop100              8.545890    3.04306541
## Ppoverty            8.651908    3.97522202
## Ppoverty75p         6.833760    2.09244802
## Purban              9.665649    1.51728436
## P75p               10.044029    1.88331991
## pop.density         9.685492    1.61661642
## mean.temp.1         7.041676    1.70637697
## mean.summer.temp   10.088439    1.43119714
## start.year         10.234676    7.86766924
varImpPlot(bag.hw)

Fitting a random forest model, with tuning:

# Use 10-fold cross validation for tuning to find the best `mtry`.
# `fitControl` is reused by all later caret fits in this script.
fitControl <- trainControl(method = "cv", number = 10)

set.seed(1)
# NOTE(review): ntree = 10 is far below randomForest's default of 500,
# presumably to keep tuning fast — RMSE and importance estimates from so
# few trees will be noisy; confirm this is intentional.
tuning.rf.hw <- train(Estimate ~ ., data = to_fit_y1, subset = train,
                   method = "rf", trControl = fitControl, ntree = 10,
                   importance = TRUE, metric="RMSE",
                   maximize = FALSE, tuneLength=5)

Here are the results from that tuning process:

# Cross-validation results for each candidate mtry.
# NOTE(review): the header reports 2980 samples (the full data), but the
# fold sizes (~1341) show resampling ran on the 1490 rows passed via
# `subset = train`; caret prints pre-subset dimensions here.
tuning.rf.hw
## Random Forest 
## 
## 2980 samples
##   25 predictor
## 
## No pre-processing
## Resampling: Cross-Validated (10 fold) 
## Summary of sample sizes: 1342, 1341, 1339, 1339, 1342, 1342, ... 
## Resampling results across tuning parameters:
## 
##   mtry  RMSE       Rsquared  
##    2    0.2041435  0.02051843
##    7    0.2081761  0.02619107
##   13    0.2025300  0.05091675
##   19    0.2022381  0.05765925
##   25    0.2036988  0.04836574
## 
## RMSE was used to select the optimal model using  the smallest value.
## The final value used for the model was mtry = 19.
plot(tuning.rf.hw)

Based on this tuning, the best value for mtry in the random forest model for this outcome is 19.

# Predict on the held-out rows. Use predict() on the caret `train`
# object itself rather than on `$finalModel`: predict.train applies any
# preprocessing and the formula/dummy-variable handling caret used when
# fitting, which is the interface the caret documentation recommends.
# (With all-numeric predictors and no preprocessing, predictions match
# the finalModel call the original used.)
yhat.rf <- predict(tuning.rf.hw, newdata = to_fit_y1[test, ])

# Observed vs predicted on common axis limits, with the y = x line
# marking perfect prediction.
to_plot <- data.frame(estimated_y1 = yhat.rf,
                      true_y1 = to_fit_y1[test, "Estimate"])
ggplot(to_plot, aes(x = estimated_y1, y = true_y1)) + 
  geom_point(alpha = 0.2) + 
  xlim(range(to_plot)) + ylim(range(to_plot)) + 
  theme_minimal() + 
  geom_abline(intercept = 0, slope = 1) + 
  xlab("Estimated Y_1") + ylab("True Y_1")

The mean squared error for this model was \(1.0931865 \times 10^{-5}\).

Here are the variable importance plots:

# Importance from the tuned forest's underlying randomForest object
# ($finalModel is the right object for importance()/varImpPlot()).
# NOTE(review): these scores come from only ntree = 10 trees; the
# negative %IncMSE values are likely noise from so few trees.
importance(tuning.rf.hw$finalModel)
##                        %IncMSE IncNodePurity
## mean.temp           2.09145601   2.201291869
## max.temp            1.59144388   1.957183048
## min.temp            3.17867200   1.495273519
## length              1.38093988   1.646819207
## start.doy           1.50580173   6.749238062
## start.month         1.36149982   0.800102981
## days.above.80       1.99678188   0.934009110
## days.above.85       1.31846692   0.281415382
## days.above.90       1.68037566   0.284349229
## days.above.95      -1.05409255   0.000438401
## days.above.99th     1.46687174   0.617808370
## days.above.99.5th   2.03173084   0.443275791
## first.in.season    -1.06282656   0.697164009
## mean.temp.quantile  1.99793943   4.278970715
## max.temp.quantile   2.38871728   4.321199142
## min.temp.quantile   3.10884927   3.762528078
## pop100              1.24118549   3.856162140
## Ppoverty           -1.09203395   4.870776390
## Ppoverty75p         0.36232123   1.527727356
## Purban             -0.85326202   1.498955910
## P75p                2.33854251   1.996748391
## pop.density        -0.05911208   2.259959535
## mean.temp.1         3.07875722   1.907943862
## mean.summer.temp    1.75185187   2.953429050
## start.year         -0.64510490   4.932279441
varImpPlot(tuning.rf.hw$finalModel)

Fit a boosting model:

#### Tune interaction depth and number of trees for the boosting model.
# Shrinkage is held at 0.01: 0.1 was tried earlier and performed worse,
# so it is dropped from the grid to save time.
gbmgrid <- expand.grid(.interaction.depth = seq(1, 7, by = 2),
                       .n.trees = seq(200, 300, by = 50),
                       .shrinkage = 0.01,
                       .n.minobsinnode = 10)
set.seed(1)

# Bug fix: fit on `to_fit_y1` (NOT `hw`). `hw` still contains the other
# outcome, `post.estimate`, so the original call let it enter as a
# predictor of `Estimate` — outcome leakage, visible in the original
# run's output ("26 predictor" instead of 25, and post.estimate carrying
# ~88% of the relative influence in the fitted model's summary).
tuning.boost.hw <- train(Estimate ~ ., data = to_fit_y1[train, ],
                         method = "gbm",
                         tuneGrid = gbmgrid,
                         trControl = fitControl,
                         verbose = FALSE)
## Loading required package: plyr
## -------------------------------------------------------------------------
## You have loaded plyr after dplyr - this is likely to cause problems.
## If you need functions from both plyr and dplyr, please load plyr first, then dplyr:
## library(plyr); library(dplyr)
## -------------------------------------------------------------------------
## 
## Attaching package: 'plyr'
## The following objects are masked from 'package:dplyr':
## 
##     arrange, count, desc, failwith, id, mutate, rename, summarise,
##     summarize
## The following object is masked from 'package:lubridate':
## 
##     here
# Cross-validation results for the boosting grid.
# NOTE(review): the header reports 26 predictors because this fit used
# `hw[train, ]`, which still contains the other outcome (post.estimate)
# as a predictor — compare with the 25 predictors reported by the
# random-forest tuning above. Results below reflect that leaky fit.
tuning.boost.hw
## Stochastic Gradient Boosting 
## 
## 1490 samples
##   26 predictor
## 
## No pre-processing
## Resampling: Cross-Validated (10 fold) 
## Summary of sample sizes: 1342, 1341, 1339, 1339, 1342, 1342, ... 
## Resampling results across tuning parameters:
## 
##   interaction.depth  n.trees  RMSE        Rsquared 
##   1                  200      0.12337304  0.6630695
##   1                  250      0.12028095  0.6677372
##   1                  300      0.11854686  0.6699416
##   3                  200      0.10794449  0.7426040
##   3                  250      0.10367313  0.7528640
##   3                  300      0.10124069  0.7588751
##   5                  200      0.09906492  0.7865518
##   5                  250      0.09424685  0.7967211
##   5                  300      0.09121802  0.8043525
##   7                  200      0.09346555  0.8114404
##   7                  250      0.08827967  0.8213362
##   7                  300      0.08519342  0.8276194
## 
## Tuning parameter 'shrinkage' was held constant at a value of 0.01
## 
## Tuning parameter 'n.minobsinnode' was held constant at a value of 10
## RMSE was used to select the optimal model using  the smallest value.
## The final values used for the model were n.trees = 300,
##  interaction.depth = 7, shrinkage = 0.01 and n.minobsinnode = 10.
plot(tuning.boost.hw)

## NOTE(review): the printed tuning summary selects n.trees = 300 and
## interaction.depth = 7, but the plot is read here as leveling off near
## interaction.depth = 5 / n.trees = 200, which is what is used below —
## confirm this choice is deliberate.

# Final boosting model for Y1. Two fixes relative to the original run:
# * fit on `to_fit_y1[train, ]` so the other outcome (post.estimate)
#   cannot leak in as a predictor (the original run's summary showed it
#   carrying ~88% of the relative influence);
# * set shrinkage = 0.01 explicitly — tuning held shrinkage at 0.01, but
#   gbm()'s default is 0.1, so omitting it fits a different model than
#   the one that was tuned.
set.seed(1)
boost.hw <- gbm(Estimate ~ ., data = to_fit_y1[train, ],
                distribution = "gaussian", n.trees = 200,
                interaction.depth = 5, shrinkage = 0.01)
summary(boost.hw)

# Relative influence of each predictor in the boosted model.
# NOTE(review): post.estimate accounts for ~88% of the relative
# influence here — it is the OTHER outcome variable, present only
# because the model was fit on `hw` rather than `to_fit_y1` (outcome
# leakage); these importances should not be interpreted as-is.
##                                   var     rel.inf
## post.estimate           post.estimate 88.52329512
## pop100                         pop100  5.94188732
## Ppoverty                     Ppoverty  1.63204641
## length                         length  1.10142805
## max.temp.quantile   max.temp.quantile  0.92279435
## start.doy                   start.doy  0.42099320
## Ppoverty75p               Ppoverty75p  0.37233281
## days.above.80           days.above.80  0.33361266
## start.year                 start.year  0.19736944
## mean.temp.1               mean.temp.1  0.15951344
## Purban                         Purban  0.10269778
## min.temp.quantile   min.temp.quantile  0.09067863
## mean.temp.quantile mean.temp.quantile  0.07248359
## mean.summer.temp     mean.summer.temp  0.05772668
## mean.temp                   mean.temp  0.03613501
## pop.density               pop.density  0.03500551
## max.temp                     max.temp  0.00000000
## min.temp                     min.temp  0.00000000
## start.month               start.month  0.00000000
## days.above.85           days.above.85  0.00000000
## days.above.90           days.above.90  0.00000000
## days.above.95           days.above.95  0.00000000
## days.above.99th       days.above.99th  0.00000000
## days.above.99.5th   days.above.99.5th  0.00000000
## first.in.season       first.in.season  0.00000000
## P75p                             P75p  0.00000000
# Partial-dependence plots for the three most influential (non-leaked)
# predictors, side by side.
par(mfrow =c(1,3))
plot( boost.hw ,i="pop100")
plot( boost.hw,i="Ppoverty")
plot(boost.hw,i="max.temp.quantile")

# Test-set predictions for the Y1 boosting model; n.trees must match the
# number of trees used in the fit (200 here).
yhat.boost = predict(boost.hw, newdata = hw[-train ,],n.trees =200)
hw.test <- hw[-train, "Estimate"]
mean((yhat.boost - hw.test)^2)
## [1] 0.02738211
## NOTE(review): a previous draft recorded MSE = 0.03561743 here; the
## printed value above (0.02738211) is the result of this run.
#### Second analysis: the response is post.estimate (Y2).
# Reload the raw data and rebuild `hw` from scratch.
load("data-raw/ListOfAllHeatwaves.Rdata")
hw <- data.frame(all.hws)
# Bug fix: the original line read `colnames(hw)[28] <= "mean.temp.1"`,
# a comparison (it printed FALSE) instead of an assignment, so the
# rename never executed. It went unnoticed only because
# data.frame(check.names = TRUE) had already deduplicated the second
# "mean.temp" column to "mean.temp.1". Do the rename explicitly and
# defensively: a no-op when data.frame has already fixed it.
dup_mean_temp <- which(colnames(hw) == "mean.temp")
if (length(dup_mean_temp) >= 2) {
  colnames(hw)[dup_mean_temp[2]] <- "mean.temp.1"
}
# Replace start.date with just its year, then drop identifier /
# bookkeeping columns by position in the raw frame.
hw$start.date <- year(all.hws$start.date)
hw <- hw[, -c(1, 7, 20:22, 31:33, 35)]
### Single Tree
# NOTE(review): this re-splits the data with seed 1 (not the 1001 used
# for Y1) and without sorting, so the train/test rows here differ from
# the Y1 analysis above.
set.seed(1) ##generate fixed random numbers
train = sample (1: nrow (hw), nrow(hw)/2)
# Regression tree for Y2 (post.estimate) on the training half.
tree.hw = tree(post.estimate~.,hw, subset = train )
summary (tree.hw)
## 
## Regression tree:
## tree(formula = post.estimate ~ ., data = hw, subset = train)
## Variables actually used in tree construction:
## [1] "mean.temp.quantile" "pop.density"        "mean.temp"         
## [4] "days.above.95"      "max.temp"           "Purban"            
## [7] "start.doy"         
## Number of terminal nodes:  8 
## Residual mean deviance:  0.0007425 = 1.1 / 1482 
## Distribution of residuals:
##       Min.    1st Qu.     Median       Mean    3rd Qu.       Max. 
## -0.1008000 -0.0175300 -0.0008852  0.0000000  0.0159900  0.1242000
# Draw the fitted tree with split labels.
plot(tree.hw)
text(tree.hw,pretty=0)

# Cost-complexity cross-validation: deviance vs tree size.
cv.hw =cv.tree(tree.hw)
plot(cv.hw$size,cv.hw$dev,type='b')

## The CV plot suggests the best tree size is 4 nodes.

## Prune the tree to 4 terminal nodes, for inspection only.
prune.hw =prune.tree(tree.hw,best=4)
plot(prune.hw)
text(prune.hw,pretty =0)

## Evaluate on the held-out rows using the UNPRUNED tree (the pruned
## tree above is not used for prediction).
yhat=predict(tree.hw,newdata=hw[-train,])
# `hw.test` holds the observed Y2 values; later chunks reuse it.
hw.test=hw[-train ,'post.estimate']
plot(yhat,hw.test)
abline(0 ,1)

mean((yhat - hw.test)^2)
## [1] 0.0009294525
## Test MSE = 0.0009294525 — much lower than for the Y1 models.
#### Bagging for Y2 (post.estimate)
# randomForest is already attached at the top of the script; the repeated
# library() call is harmless and keeps this chunk self-contained.
library(randomForest)
set.seed(1)
# Bagging = random forest where every predictor is eligible at each
# split. Derive mtry from the data (ncol(hw) - 1, which is 25 here)
# instead of hard-coding 25, so the chunk stays correct if the predictor
# set changes upstream; the fitted model is identical.
bag.hw <- randomForest(post.estimate ~ ., data = hw, subset = train,
                       mtry = ncol(hw) - 1, importance = TRUE)
yhat.bag <- predict(bag.hw, newdata = hw[-train, ])
# Observed vs predicted on the held-out rows (hw.test defined above).
plot(yhat.bag, hw.test)
abline(0, 1)

mean((yhat.bag - hw.test)^2)
## [1] 0.0009133906
## Test MSE = 0.0009133906 — little change from the single tree.
# Permutation (%IncMSE) and node-purity importance for the Y2 bagging
# model.
importance(bag.hw)
##                      %IncMSE IncNodePurity
## mean.temp          12.681832    0.06342193
## max.temp           10.172925    0.04724131
## min.temp            9.987601    0.03824045
## length             10.332287    0.04498540
## start.date          7.115077    0.09209639
## start.doy           7.543092    0.12534681
## start.month         3.338849    0.01133792
## days.above.80      10.430266    0.03254294
## days.above.85       5.168632    0.01499552
## days.above.90       1.654415    0.01457275
## days.above.95       1.419662    0.01664198
## days.above.99th     8.003545    0.02054583
## days.above.99.5th   5.606331    0.01505929
## first.in.season     4.850694    0.01800880
## mean.temp.quantile 20.771740    0.11276662
## max.temp.quantile  17.800269    0.09103319
## min.temp.quantile   8.327124    0.07477411
## pop100              7.404953    0.06842809
## Ppoverty            8.288051    0.03514406
## Ppoverty75p         6.296062    0.03107052
## Purban             10.532977    0.03156202
## P75p                4.419545    0.04774665
## pop.density        13.058998    0.04261892
## mean.temp.1        10.582677    0.03965594
## mean.summer.temp   13.930181    0.03465096
varImpPlot(bag.hw)

#### random forest
#### Tune mtry for the random forest via 10-fold CV (reuses the
#### `fitControl` object defined in the Y1 analysis above).
set.seed(1)
# NOTE(review): ntree = 10 is far below randomForest's default of 500;
# tuning results from so few trees may be noisy — confirm intentional.
tuning.rf.hw=train(post.estimate~.,
                   data=hw,subset=train,
                   method = "rf",
                   trControl = fitControl,
                   ntree = 10,
                   importance = TRUE,
                   metric="RMSE",
                   maximize = FALSE,
                   tuneLength=5)
# Cross-validation results per candidate mtry; the lowest-RMSE value is
# selected automatically.
tuning.rf.hw
## Random Forest 
## 
## 2980 samples
##   25 predictor
## 
## No pre-processing
## Resampling: Cross-Validated (10 fold) 
## Summary of sample sizes: 1342, 1341, 1339, 1339, 1342, 1342, ... 
## Resampling results across tuning parameters:
## 
##   mtry  RMSE        Rsquared  
##    2    0.02952687  0.03094266
##    7    0.02955921  0.03628967
##   13    0.02975596  0.03614151
##   19    0.02994947  0.03070839
##   25    0.02946487  0.04046570
## 
## RMSE was used to select the optimal model using  the smallest value.
## The final value used for the model was mtry = 25.
plot(tuning.rf.hw)

#### The optimal mtry is 25 — i.e. ALL predictors, so the "random
#### forest" below is exactly the bagging model refit (same seed, same
#### mtry), which is why the test MSE matches the bagging MSE exactly.


set.seed(1)
rf.hw=randomForest(post.estimate~.,data=hw,subset=train,mtry=25,importance=TRUE)
yhat.bag=predict(rf.hw,newdata=hw[-train ,])
plot(yhat.bag,hw.test)
abline(0,1)

mean((yhat.bag - hw.test)^2)
## [1] 0.0009133906
## Test MSE = 0.0009133906 — identical to bagging, as expected.
# Importance scores — identical to the bagging model's above, since
# mtry = 25 with the same seed reproduces the same fit.
importance(rf.hw)
##                      %IncMSE IncNodePurity
## mean.temp          12.681832    0.06342193
## max.temp           10.172925    0.04724131
## min.temp            9.987601    0.03824045
## length             10.332287    0.04498540
## start.date          7.115077    0.09209639
## start.doy           7.543092    0.12534681
## start.month         3.338849    0.01133792
## days.above.80      10.430266    0.03254294
## days.above.85       5.168632    0.01499552
## days.above.90       1.654415    0.01457275
## days.above.95       1.419662    0.01664198
## days.above.99th     8.003545    0.02054583
## days.above.99.5th   5.606331    0.01505929
## first.in.season     4.850694    0.01800880
## mean.temp.quantile 20.771740    0.11276662
## max.temp.quantile  17.800269    0.09103319
## min.temp.quantile   8.327124    0.07477411
## pop100              7.404953    0.06842809
## Ppoverty            8.288051    0.03514406
## Ppoverty75p         6.296062    0.03107052
## Purban             10.532977    0.03156202
## P75p                4.419545    0.04774665
## pop.density        13.058998    0.04261892
## mean.temp.1        10.582677    0.03965594
## mean.summer.temp   13.930181    0.03465096
varImpPlot(rf.hw)

### Boosting for Y2 (post.estimate)
#### Tune interaction depth and number of trees for the boosted model.
# Shrinkage is held at 0.01: 0.1 performed much worse in earlier runs,
# which also pointed to depth = 7 and n.trees = 300 as the optimum.
gbmgrid <- expand.grid(.interaction.depth = seq(3, 7, by = 2),
                       .n.trees = seq(200, 300, by = 50),
                       .shrinkage = 0.01,
                       .n.minobsinnode = 10)

set.seed(1)
tuning.boost.hw <- train(post.estimate ~ .,
                         data = hw[train, ],
                         method = "gbm",
                         tuneGrid = gbmgrid,
                         trControl = fitControl,
                         verbose = FALSE)
# Cross-validation results for the Y2 boosting grid; lowest RMSE occurs
# at n.trees = 300, interaction.depth = 7.
tuning.boost.hw
## Stochastic Gradient Boosting 
## 
## 1490 samples
##   25 predictor
## 
## No pre-processing
## Resampling: Cross-Validated (10 fold) 
## Summary of sample sizes: 1342, 1341, 1339, 1339, 1342, 1342, ... 
## Resampling results across tuning parameters:
## 
##   interaction.depth  n.trees  RMSE        Rsquared  
##   3                  200      0.02800774  0.06218608
##   3                  250      0.02796891  0.06305961
##   3                  300      0.02793282  0.06466670
##   5                  200      0.02787142  0.06951926
##   5                  250      0.02782948  0.07136949
##   5                  300      0.02779028  0.07275341
##   7                  200      0.02774866  0.07740408
##   7                  250      0.02771617  0.07814106
##   7                  300      0.02770656  0.07820711
## 
## Tuning parameter 'shrinkage' was held constant at a value of 0.01
## 
## Tuning parameter 'n.minobsinnode' was held constant at a value of 10
## RMSE was used to select the optimal model using  the smallest value.
## The final values used for the model were n.trees = 300,
##  interaction.depth = 7, shrinkage = 0.01 and n.minobsinnode = 10.
plot(tuning.boost.hw)

# Final Y2 boosting model with the tuned settings (n.trees = 300,
# interaction.depth = 7). Bug fix: set shrinkage = 0.01 explicitly —
# the tuning grid held shrinkage at 0.01, but gbm()'s default is 0.1,
# so omitting it would fit a different model than the one selected
# by the tuning above.
set.seed(1)
boost.hw <- gbm(post.estimate ~ ., data = hw[train, ],
                distribution = "gaussian", n.trees = 300,
                interaction.depth = 7, shrinkage = 0.01)
summary(boost.hw)

# Relative influence of each predictor in the Y2 boosted model.
##                                   var     rel.inf
## mean.temp.quantile mean.temp.quantile 15.91038293
## start.doy                   start.doy 10.41904970
## pop100                         pop100  7.08397301
## pop.density               pop.density  6.98980330
## mean.summer.temp     mean.summer.temp  6.77432345
## start.date                 start.date  6.04210733
## length                         length  5.89611867
## mean.temp.1               mean.temp.1  5.70701630
## max.temp                     max.temp  4.94489007
## max.temp.quantile   max.temp.quantile  4.86514295
## mean.temp                   mean.temp  3.32831254
## min.temp.quantile   min.temp.quantile  3.19474366
## P75p                             P75p  3.04572945
## Purban                         Purban  3.02447910
## Ppoverty                     Ppoverty  2.94070082
## days.above.80           days.above.80  2.52964834
## min.temp                     min.temp  1.75729102
## days.above.85           days.above.85  1.16761793
## days.above.99th       days.above.99th  1.11275935
## first.in.season       first.in.season  1.08215392
## days.above.90           days.above.90  1.06099544
## Ppoverty75p               Ppoverty75p  0.76771776
## days.above.95           days.above.95  0.26854035
## start.month               start.month  0.08650262
## days.above.99.5th   days.above.99.5th  0.00000000
# Partial dependence for the two most influential predictors.
par(mfrow =c(1,2))
plot( boost.hw ,i="mean.temp.quantile")
plot( boost.hw,i="start.doy")

# Test-set MSE. `hw.test` (observed post.estimate for the held-out rows)
# was defined in the single-tree chunk above; n.trees matches the fit.
yhat.boost = predict(boost.hw, newdata = hw[-train ,],n.trees =300)
mean((yhat.boost - hw.test)^2)
## [1] 0.0009912703
## Test MSE = 0.0009912703